Given a directed social graph, we have to predict missing links in order to recommend users to follow (link prediction in a graph).
Data is taken from Facebook's recruiting challenge on Kaggle: https://www.kaggle.com/c/FacebookRecruiting
The data contains two columns, source and destination, one row per edge in the graph.
- Data columns (total 2 columns):
- source_node int64
- destination_node int64
#Importing Libraries
# please do go through this python notebook:
import warnings
warnings.filterwarnings("ignore")
import csv
import pandas as pd # pandas to create small dataframes
import datetime # convert to unix time
import time # convert to unix time
# if numpy is not installed already: pip3 install numpy
import numpy as np # arithmetic operations on arrays
# matplotlib: used to plot graphs
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns # plots
from matplotlib import rcParams # size of plots
from sklearn.cluster import MiniBatchKMeans, KMeans # clustering
import math
import pickle
import os
# to install xgboost: pip3 install xgboost
import xgboost as xgb
import networkx as nx
import pdb
#reading graph; create the header-less edge list if it does not exist, then always load the graph
if not os.path.isfile('data/after_eda/train_woheader.csv'):
    traincsv = pd.read_csv('data/train.csv')
    print(traincsv[traincsv.isna().any(axis=1)])
    print(traincsv.info())
    print("Number of duplicate entries: ", sum(traincsv.duplicated()))
    traincsv.to_csv('data/after_eda/train_woheader.csv', header=False, index=False)
    print("saved the graph into file")
g = nx.read_edgelist('data/after_eda/train_woheader.csv', delimiter=',', create_using=nx.DiGraph(), nodetype=int)
print(nx.info(g))
Displaying a subgraph
if not os.path.isfile('train_woheader_sample.csv'):
pd.read_csv('data/train.csv', nrows=50).to_csv('train_woheader_sample.csv',header=False,index=False)
subgraph=nx.read_edgelist('train_woheader_sample.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int)
# https://stackoverflow.com/questions/9402255/drawing-a-huge-graph-with-networkx-and-matplotlib
pos=nx.spring_layout(subgraph)
nx.draw(subgraph,pos,node_color='#A0CBE2',edge_color='#00bb5e',width=1,edge_cmap=plt.cm.Blues,with_labels=True)
plt.savefig("graph_sample.pdf")
print(nx.info(subgraph))
# No of Unique persons
print("The number of unique persons",len(g.nodes()))
indegree_dist = list(dict(g.in_degree()).values())
indegree_dist.sort()
plt.figure(figsize=(10,6))
plt.plot(indegree_dist)
plt.xlabel('Index No')
plt.ylabel('No Of Followers')
plt.show()
indegree_dist = list(dict(g.in_degree()).values())
indegree_dist.sort()
plt.figure(figsize=(10,6))
plt.plot(indegree_dist[0:1500000])
plt.xlabel('Index No')
plt.ylabel('No Of Followers')
plt.show()
plt.boxplot(indegree_dist)
plt.ylabel('No Of Followers')
plt.show()
### 90-100 percentile
for i in range(0,11):
print(90+i,'percentile value is',np.percentile(indegree_dist,90+i))
99% of users have 40 or fewer followers.
### 99-100 percentile
for i in range(10,110,10):
print(99+(i/100),'percentile value is',np.percentile(indegree_dist,99+(i/100)))
%matplotlib inline
sns.set_style('ticks')
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
sns.distplot(indegree_dist, color='#16A085')
plt.xlabel('PDF of Indegree')
sns.despine()
#plt.show()
outdegree_dist = list(dict(g.out_degree()).values())
outdegree_dist.sort()
plt.figure(figsize=(10,6))
plt.plot(outdegree_dist)
plt.xlabel('Index No')
plt.ylabel('No Of people each person is following')
plt.show()
outdegree_dist = list(dict(g.out_degree()).values())
outdegree_dist.sort()
plt.figure(figsize=(10,6))
plt.plot(outdegree_dist[0:1500000])
plt.xlabel('Index No')
plt.ylabel('No Of people each person is following')
plt.show()
plt.boxplot(outdegree_dist)
plt.ylabel('No Of people each person is following')
plt.show()
### 90-100 percentile
for i in range(0,11):
print(90+i,'percentile value is',np.percentile(outdegree_dist,90+i))
### 99-100 percentile
for i in range(10,110,10):
print(99+(i/100),'percentile value is',np.percentile(outdegree_dist,99+(i/100)))
sns.set_style('ticks')
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
sns.distplot(outdegree_dist, color='#16A085')
plt.xlabel('PDF of Outdegree')
sns.despine()
print('No of persons who are not following anyone:', sum(np.array(outdegree_dist)==0), 'and % is',
      sum(np.array(outdegree_dist)==0)*100/len(outdegree_dist))
print('No of persons having zero followers:', sum(np.array(indegree_dist)==0), 'and % is',
      sum(np.array(indegree_dist)==0)*100/len(indegree_dist))
count = 0
for i in g.nodes():
    if len(list(g.predecessors(i))) == 0:
        if len(list(g.successors(i))) == 0:
            count += 1
print('No of persons who are neither following anyone nor have any followers:', count)
from collections import Counter
dict_in = dict(g.in_degree())
dict_out = dict(g.out_degree())
d = Counter(dict_in) + Counter(dict_out)
in_out_degree = np.array(list(d.values()))
in_out_degree_sort = sorted(in_out_degree)
plt.figure(figsize=(10,6))
plt.plot(in_out_degree_sort)
plt.xlabel('Index No')
plt.ylabel('No Of people each person is following + followers')
plt.show()
in_out_degree_sort = sorted(in_out_degree)
plt.figure(figsize=(10,6))
plt.plot(in_out_degree_sort[0:1500000])
plt.xlabel('Index No')
plt.ylabel('No Of people each person is following + followers')
plt.show()
### 90-100 percentile
for i in range(0,11):
print(90+i,'percentile value is',np.percentile(in_out_degree_sort,90+i))
### 99-100 percentile
for i in range(10,110,10):
print(99+(i/100),'percentile value is',np.percentile(in_out_degree_sort,99+(i/100)))
print('Min of no of followers + following is',in_out_degree.min())
print(np.sum(in_out_degree==in_out_degree.min()),' persons having minimum no of followers + following')
print('Max of no of followers + following is',in_out_degree.max())
print(np.sum(in_out_degree==in_out_degree.max()),' persons having maximum no of followers + following')
print('No of persons having followers + following less than 10 are',np.sum(in_out_degree<10))
print('No of weakly connected components',len(list(nx.weakly_connected_components(g))))
count=0
for i in list(nx.weakly_connected_components(g)):
if len(i)==2:
count+=1
print('weakly connected components with 2 nodes', count)
Generating "bad" (negative) links: node pairs which are not edges in the graph and whose shortest-path distance is greater than 2.
%%time
###generating bad edges from given graph
import random
if not os.path.isfile('data/after_eda/missing_edges_final.p'):
    #getting all set of edges; cast to int so lookups with randint keys match
    r = csv.reader(open('data/after_eda/train_woheader.csv','r'))
    edges = dict()
    for edge in r:
        edges[(int(edge[0]), int(edge[1]))] = 1
    missing_edges = set([])
    while (len(missing_edges) < 9437519):
        a = random.randint(1, 1862220)
        b = random.randint(1, 1862220)
        tmp = edges.get((a,b), -1)
        if tmp == -1 and a != b:
            try:
                # only keep pairs that are more than 2 hops apart
                if nx.shortest_path_length(g, source=a, target=b) > 2:
                    missing_edges.add((a,b))
                else:
                    continue
            except:
                # no path at all (or node not in the graph) is also a valid negative
                missing_edges.add((a,b))
        else:
            continue
    pickle.dump(missing_edges, open('data/after_eda/missing_edges_final.p','wb'))
else:
    missing_edges = pickle.load(open('data/after_eda/missing_edges_final.p','rb'))
len(missing_edges)
Edges are removed from the graph and used as test data; after removal, the remaining graph is used to create features for both train and test data.
from sklearn.model_selection import train_test_split
if (not os.path.isfile('data/after_eda/train_pos_after_eda.csv')) and (not os.path.isfile('data/after_eda/test_pos_after_eda.csv')):
#reading total data df
df_pos = pd.read_csv('data/train.csv')
df_neg = pd.DataFrame(list(missing_edges), columns=['source_node', 'destination_node'])
print("Number of nodes in the graph with edges", df_pos.shape[0])
print("Number of nodes in the graph without edges", df_neg.shape[0])
#Trian test split
#Spiltting data into 80-20
#positive links and negative links seperatly because we need positive training data only for creating graph
#and for feature generation
X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(df_pos,np.ones(len(df_pos)),test_size=0.2, random_state=9)
X_train_neg, X_test_neg, y_train_neg, y_test_neg = train_test_split(df_neg,np.zeros(len(df_neg)),test_size=0.2, random_state=9)
print('='*60)
print("Number of nodes in the train data graph with edges", X_train_pos.shape[0],"=",y_train_pos.shape[0])
print("Number of nodes in the train data graph without edges", X_train_neg.shape[0],"=", y_train_neg.shape[0])
print('='*60)
print("Number of nodes in the test data graph with edges", X_test_pos.shape[0],"=",y_test_pos.shape[0])
print("Number of nodes in the test data graph without edges", X_test_neg.shape[0],"=",y_test_neg.shape[0])
#removing header and saving
X_train_pos.to_csv('data/after_eda/train_pos_after_eda.csv',header=False, index=False)
X_test_pos.to_csv('data/after_eda/test_pos_after_eda.csv',header=False, index=False)
X_train_neg.to_csv('data/after_eda/train_neg_after_eda.csv',header=False, index=False)
X_test_neg.to_csv('data/after_eda/test_neg_after_eda.csv',header=False, index=False)
else:
#split files already exist; free the memory used by missing_edges
del missing_edges
if (os.path.isfile('data/after_eda/train_pos_after_eda.csv')) and (os.path.isfile('data/after_eda/test_pos_after_eda.csv')):
train_graph=nx.read_edgelist('data/after_eda/train_pos_after_eda.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int)
test_graph=nx.read_edgelist('data/after_eda/test_pos_after_eda.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int)
print(nx.info(train_graph))
print(nx.info(test_graph))
# finding the unique nodes in the both train and test graphs
train_nodes_pos = set(train_graph.nodes())
test_nodes_pos = set(test_graph.nodes())
trY_teY = len(train_nodes_pos.intersection(test_nodes_pos))
trY_teN = len(train_nodes_pos - test_nodes_pos)
teY_trN = len(test_nodes_pos - train_nodes_pos)
print('no of people common in train and test -- ',trY_teY)
print('no of people present in train but not present in test -- ',trY_teN)
print('no of people present in test but not present in train -- ',teY_trN)
print('% of test nodes not present in the train graph: {} %'.format(teY_trN/len(test_nodes_pos)*100))
We have a cold-start problem here: a noticeable fraction of test nodes never appear in the training graph.
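A minimal sketch of the mitigation used later in featurization (toy values, not the project data): graph features for nodes absent from the train graph are imputed with the mean over training nodes.
#toy sketch: mean-imputation for cold-start nodes (hypothetical feature values)
toy_feature = {1: 0.2, 2: 0.4, 3: 0.9} #per-node feature computed on train-graph nodes only
toy_mean = sum(toy_feature.values())/len(toy_feature) #fallback for unseen nodes
print(toy_feature.get(99, toy_mean)) #node 99 never seen in train -> 0.5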
#final train and test data sets
if (not os.path.isfile('data/after_eda/train_after_eda.csv')) and \
(not os.path.isfile('data/after_eda/test_after_eda.csv')) and \
(not os.path.isfile('data/train_y.csv')) and \
(not os.path.isfile('data/test_y.csv')) and \
(os.path.isfile('data/after_eda/train_pos_after_eda.csv')) and \
(os.path.isfile('data/after_eda/test_pos_after_eda.csv')) and \
(os.path.isfile('data/after_eda/train_neg_after_eda.csv')) and \
(os.path.isfile('data/after_eda/test_neg_after_eda.csv')):
X_train_pos = pd.read_csv('data/after_eda/train_pos_after_eda.csv', names=['source_node', 'destination_node'])
X_test_pos = pd.read_csv('data/after_eda/test_pos_after_eda.csv', names=['source_node', 'destination_node'])
X_train_neg = pd.read_csv('data/after_eda/train_neg_after_eda.csv', names=['source_node', 'destination_node'])
X_test_neg = pd.read_csv('data/after_eda/test_neg_after_eda.csv', names=['source_node', 'destination_node'])
print('='*60)
print("Number of nodes in the train data graph with edges", X_train_pos.shape[0])
print("Number of nodes in the train data graph without edges", X_train_neg.shape[0])
print('='*60)
print("Number of nodes in the test data graph with edges", X_test_pos.shape[0])
print("Number of nodes in the test data graph without edges", X_test_neg.shape[0])
X_train = pd.concat([X_train_pos, X_train_neg], ignore_index=True)
y_train = np.concatenate((y_train_pos, y_train_neg))
X_test = pd.concat([X_test_pos, X_test_neg], ignore_index=True)
y_test = np.concatenate((y_test_pos, y_test_neg))
X_train.to_csv('data/after_eda/train_after_eda.csv',header=False,index=False)
X_test.to_csv('data/after_eda/test_after_eda.csv',header=False,index=False)
pd.DataFrame(y_train.astype(int)).to_csv('data/train_y.csv',header=False,index=False)
pd.DataFrame(y_test.astype(int)).to_csv('data/test_y.csv',header=False,index=False)
print("Data points in train data",X_train.shape)
print("Data points in test data",X_test.shape)
print("Shape of traget variable in train",y_train.shape)
print("Shape of traget variable in test", y_test.shape)
#Importing Libraries
# please do go through this python notebook:
import warnings
warnings.filterwarnings("ignore")
import csv
import pandas as pd # pandas to create small dataframes
import datetime # convert to unix time
import time # convert to unix time
# if numpy is not installed already: pip3 install numpy
import numpy as np # arithmetic operations on arrays
# matplotlib: used to plot graphs
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns # plots
from matplotlib import rcParams # size of plots
from sklearn.cluster import MiniBatchKMeans, KMeans # clustering
import math
import pickle
import os
# to install xgboost: pip3 install xgboost
import xgboost as xgb
import networkx as nx
import pdb
from pandas import HDFStore, DataFrame
from pandas import read_hdf
from scipy.sparse.linalg import svds, eigs
import gc
from tqdm import tqdm
if os.path.isfile('train_pos_after_eda.csv'):
train_graph=nx.read_edgelist('train_pos_after_eda.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int)
print(nx.info(train_graph))
else:
print("please run the FB_EDA.ipynb or download the files from drive")
\begin{equation} j = \frac{|X\cap Y|}{|X \cup Y|} \end{equation}
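For example, with followee sets $X=\{1,2,3\}$ and $Y=\{2,3,4\}$, the Jaccard index is $j = \frac{|\{2,3\}|}{|\{1,2,3,4\}|} = \frac{2}{4} = 0.5$.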
#for followees
def jaccard_for_followees(a,b):
    try:
        if len(set(train_graph.successors(a))) == 0 or len(set(train_graph.successors(b))) == 0:
            return 0
        sim = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\
              (len(set(train_graph.successors(a)).union(set(train_graph.successors(b)))))
    except:
        return 0
    return sim
#one test case
print(jaccard_for_followees(273084,1505602))
#node 1635354 not in graph
print(jaccard_for_followees(273084,1635354))
#for followers
def jaccard_for_followers(a,b):
    try:
        if len(set(train_graph.predecessors(a))) == 0 or len(set(train_graph.predecessors(b))) == 0:
            return 0
        sim = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\
              (len(set(train_graph.predecessors(a)).union(set(train_graph.predecessors(b)))))
        return sim
    except:
        return 0
print(jaccard_for_followers(273084,470294))
#node 1635354 not in graph
print(jaccard_for_followers(669354,1635354))
\begin{equation} CosineSimilarity = \frac{|X\cap Y|}{\sqrt{|X|\cdot|Y|}} \end{equation}
#for followees
def cosine_for_followees(a,b):
    try:
        if len(set(train_graph.successors(a))) == 0 or len(set(train_graph.successors(b))) == 0:
            return 0
        sim = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\
              (math.sqrt(len(set(train_graph.successors(a))) * len(set(train_graph.successors(b)))))
        return sim
    except:
        return 0
print(cosine_for_followees(273084,1505602))
print(cosine_for_followees(273084,1635354))
def cosine_for_followers(a,b):
    try:
        if len(set(train_graph.predecessors(a))) == 0 or len(set(train_graph.predecessors(b))) == 0:
            return 0
        sim = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\
              (math.sqrt(len(set(train_graph.predecessors(a))) * len(set(train_graph.predecessors(b)))))
        return sim
    except:
        return 0
print(cosine_for_followers(2,470294))
print(cosine_for_followers(669354,1635354))
PageRank computes a ranking of the nodes in the graph G based on the structure of the incoming links.
(Figure: mathematical PageRanks for a simple network, expressed as percentages; Google uses a logarithmic scale.) A page can have a high PageRank with few incoming links if those links come from important pages. If web surfers who start on a random page have an 85% likelihood of choosing a random link from the page they are currently visiting, and a 15% likelihood of jumping to a page chosen at random from the entire web, the 15% jump probability corresponds to a damping factor of 0.85. Without damping, all web surfers would eventually end up on the sink pages and all other pages would have PageRank zero; in the presence of damping, a page with no outgoing links effectively links to all pages in the web.
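As a quick sanity check on a hypothetical toy graph (not the project data), nx.pagerank returns a dict mapping each node to a score, and the scores sum to 1:
#toy example: pagerank output format; alpha is the damping factor
toy_g = nx.DiGraph([(1,2),(2,3),(3,1),(4,1)])
toy_pr = nx.pagerank(toy_g, alpha=0.85)
print(toy_pr, 'sum =', sum(toy_pr.values()))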
if not os.path.isfile('data/fea_sample/page_rank.p'):
pr = nx.pagerank(train_graph, alpha=0.85)
pickle.dump(pr,open('data/fea_sample/page_rank.p','wb'))
else:
pr = pickle.load(open('data/fea_sample/page_rank.p','rb'))
print('min',pr[min(pr, key=pr.get)])
print('max',pr[max(pr, key=pr.get)])
print('mean',float(sum(pr.values())) / len(pr))
#for imputing to nodes which are not there in Train data
mean_pr = float(sum(pr.values())) / len(pr)
print(mean_pr)
Getting the shortest path between two nodes; if the nodes are directly connected, we temporarily remove that edge and compute the path (otherwise the feature would leak the label).
#if there is a direct edge, delete it before computing the shortest path
def compute_shortest_path_length(a,b):
    p = -1
    try:
        if train_graph.has_edge(a,b):
            train_graph.remove_edge(a,b)
            try:
                p = nx.shortest_path_length(train_graph, source=a, target=b)
            finally:
                train_graph.add_edge(a,b) # restore the edge even if no path exists
        else:
            p = nx.shortest_path_length(train_graph, source=a, target=b)
        return p
    except:
        return -1
#testing
compute_shortest_path_length(77697, 826021)
#testing
compute_shortest_path_length(669354,1635354)
#getting weakly connected components from the graph
wcc=list(nx.weakly_connected_components(train_graph))
def belongs_to_same_wcc(a,b):
index = []
if train_graph.has_edge(b,a):
return 1
if train_graph.has_edge(a,b):
for i in wcc:
if a in i:
index= i
break
if (b in index):
train_graph.remove_edge(a,b)
if compute_shortest_path_length(a,b)==-1:
train_graph.add_edge(a,b)
return 0
else:
train_graph.add_edge(a,b)
return 1
else:
return 0
else:
for i in wcc:
if a in i:
index= i
break
if(b in index):
return 1
else:
return 0
belongs_to_same_wcc(861, 1659750)
belongs_to_same_wcc(669354,1635354)
The Adamic/Adar index is defined as the inverted sum of the log-degrees of the common neighbours of the two vertices: $$A(x,y)=\sum_{u \in N(x) \cap N(y)}\frac{1}{\log(|N(u)|)}$$
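For example (with the base-10 log used in the code below), a common neighbour followed by 100 people contributes $\frac{1}{\log_{10}(100)} = 0.5$, while one followed by 10 people contributes $\frac{1}{\log_{10}(10)} = 1$: rarer common neighbours are stronger evidence for a link.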
#adar index
def calc_adar_in(a,b):
    total = 0
    try:
        n = list(set(train_graph.successors(a)).intersection(set(train_graph.successors(b))))
        if len(n) != 0:
            for i in n:
                # base-10 log of the common neighbour's follower count
                total = total + (1/np.log10(len(list(train_graph.predecessors(i)))))
            return total
        else:
            return 0
    except:
        return 0
calc_adar_in(1,189226)
calc_adar_in(669354,1635354)
def follows_back(a,b):
if train_graph.has_edge(b,a):
return 1
else:
return 0
follows_back(1,189226)
follows_back(669354,1635354)
https://en.wikipedia.org/wiki/Katz_centrality
https://www.geeksforgeeks.org/katz-centrality-centrality-measure/
Katz centrality computes the centrality for a node
based on the centrality of its neighbors. It is a
generalization of the eigenvector centrality. The
Katz centrality for node i is
$$x_i = \alpha \sum_{j} A_{ij} x_j + \beta,$$
where A is the adjacency matrix of the graph G
with eigenvalues $$\lambda$$.
The parameter $$\beta$$ controls the initial centrality and
$$\alpha < \frac{1}{\lambda_{max}}.$$
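A toy check of the $\alpha$ constraint on a hypothetical graph (not the project graph; assumes networkx >= 2.0 for nx.to_numpy_array):
#toy example: alpha must stay below 1/lambda_max of the adjacency matrix
toy_g = nx.DiGraph([(1,2),(2,3),(3,1),(1,3)])
lam_max = max(abs(np.linalg.eigvals(nx.to_numpy_array(toy_g))))
print('alpha must be below', 1/lam_max)
print(nx.katz_centrality(toy_g, alpha=0.5/lam_max, beta=1.0))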
if not os.path.isfile('data/fea_sample/katz.p'):
katz = nx.katz.katz_centrality(train_graph,alpha=0.005,beta=1)
pickle.dump(katz,open('data/fea_sample/katz.p','wb'))
else:
katz = pickle.load(open('data/fea_sample/katz.p','rb'))
print('min',katz[min(katz, key=katz.get)])
print('max',katz[max(katz, key=katz.get)])
print('mean',float(sum(katz.values())) / len(katz))
mean_katz = float(sum(katz.values())) / len(katz)
print(mean_katz)
The HITS algorithm computes two scores per node: the authority score estimates the value of a node based on its incoming links, and the hub score estimates it based on its outgoing links.
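nx.hits returns a (hubs, authorities) pair of dicts, which is why the scores are indexed as hits[0] and hits[1] below; a toy illustration on a hypothetical graph:
#toy example: hits() returns two node->score dicts, (hubs, authorities)
toy_hubs, toy_auths = nx.hits(nx.DiGraph([(1,2),(1,3),(4,3)]))
print('hub score of 1:', toy_hubs[1], 'authority score of 3:', toy_auths[3])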
if not os.path.isfile('data/fea_sample/hits.p'):
hits = nx.hits(train_graph, max_iter=100, tol=1e-08, nstart=None, normalized=True)
pickle.dump(hits,open('data/fea_sample/hits.p','wb'))
else:
hits = pickle.load(open('data/fea_sample/hits.p','rb'))
print('min hub score', hits[0][min(hits[0], key=hits[0].get)])
print('max hub score', hits[0][max(hits[0], key=hits[0].get)])
print('mean hub score', float(sum(hits[0].values())) / len(hits[0]))
import random
if os.path.isfile('train_after_eda.csv'):
    filename = "train_after_eda.csv"
    # uncomment the next line if you don't know the number of lines in the file;
    # here we have hardcoded it
    # n_train = sum(1 for line in open(filename)) #number of records in file (excludes header)
    n_train = 15100028
    s = 100000 #desired sample size
    skip_train = sorted(random.sample(range(1, n_train+1), n_train - s))
    #https://stackoverflow.com/a/22259008/4084039
if os.path.isfile('test_after_eda.csv'):
    filename = "test_after_eda.csv"
    # uncomment the next line if you don't know the number of lines in the file;
    # here we have hardcoded it
    # n_test = sum(1 for line in open(filename)) #number of records in file (excludes header)
    n_test = 3775006
    s = 50000 #desired sample size
    skip_test = sorted(random.sample(range(1, n_test+1), n_test - s))
    #https://stackoverflow.com/a/22259008/4084039
print("Number of rows in the train data file:", n_train)
print("Number of rows we are going to elimiate in train data are",len(skip_train))
print("Number of rows in the test data file:", n_test)
print("Number of rows we are going to elimiate in test data are",len(skip_test))
df_final_train = pd.read_csv('train_after_eda.csv', skiprows=skip_train, names=['source_node', 'destination_node'])
df_final_train['indicator_link'] = pd.read_csv('train_y.csv', skiprows=skip_train, names=['indicator_link'])
print("Our train matrix size ",df_final_train.shape)
df_final_train.head(2)
df_final_test = pd.read_csv('test_after_eda.csv', skiprows=skip_test, names=['source_node', 'destination_node'])
df_final_test['indicator_link'] = pd.read_csv('test_y.csv', skiprows=skip_test, names=['indicator_link'])
print("Our test matrix size ",df_final_test.shape)
df_final_test.head(2)
We will create each of these features for both train and test data points: Jaccard and cosine similarity over followers and followees.
if not os.path.isfile('data/fea_sample/storage_sample_stage1.h5'):
    #mapping jaccard followers to train and test data
    df_final_train['jaccard_followers'] = df_final_train.apply(lambda row:
                                            jaccard_for_followers(row['source_node'],row['destination_node']),axis=1)
    df_final_test['jaccard_followers'] = df_final_test.apply(lambda row:
                                            jaccard_for_followers(row['source_node'],row['destination_node']),axis=1)
    #mapping jaccard followees to train and test data
    df_final_train['jaccard_followees'] = df_final_train.apply(lambda row:
                                            jaccard_for_followees(row['source_node'],row['destination_node']),axis=1)
    df_final_test['jaccard_followees'] = df_final_test.apply(lambda row:
                                            jaccard_for_followees(row['source_node'],row['destination_node']),axis=1)
    #mapping cosine followers to train and test data
    df_final_train['cosine_followers'] = df_final_train.apply(lambda row:
                                            cosine_for_followers(row['source_node'],row['destination_node']),axis=1)
    df_final_test['cosine_followers'] = df_final_test.apply(lambda row:
                                            cosine_for_followers(row['source_node'],row['destination_node']),axis=1)
    #mapping cosine followees to train and test data
    df_final_train['cosine_followees'] = df_final_train.apply(lambda row:
                                            cosine_for_followees(row['source_node'],row['destination_node']),axis=1)
    df_final_test['cosine_followees'] = df_final_test.apply(lambda row:
                                            cosine_for_followees(row['source_node'],row['destination_node']),axis=1)
def compute_features_stage1(df_final):
#calculating no of followers followees for source and destination
#calculating intersection of followers and followees for source and destination
num_followers_s=[]
num_followees_s=[]
num_followers_d=[]
num_followees_d=[]
inter_followers=[]
inter_followees=[]
for i,row in df_final.iterrows():
try:
s1=set(train_graph.predecessors(row['source_node']))
s2=set(train_graph.successors(row['source_node']))
except:
s1 = set()
s2 = set()
try:
d1=set(train_graph.predecessors(row['destination_node']))
d2=set(train_graph.successors(row['destination_node']))
except:
d1 = set()
d2 = set()
num_followers_s.append(len(s1))
num_followees_s.append(len(s2))
num_followers_d.append(len(d1))
num_followees_d.append(len(d2))
inter_followers.append(len(s1.intersection(d1)))
inter_followees.append(len(s2.intersection(d2)))
return num_followers_s, num_followers_d, num_followees_s, num_followees_d, inter_followers, inter_followees
if not os.path.isfile('data/fea_sample/storage_sample_stage1.h5'):
df_final_train['num_followers_s'], df_final_train['num_followers_d'], \
df_final_train['num_followees_s'], df_final_train['num_followees_d'], \
df_final_train['inter_followers'], df_final_train['inter_followees']= compute_features_stage1(df_final_train)
df_final_test['num_followers_s'], df_final_test['num_followers_d'], \
df_final_test['num_followees_s'], df_final_test['num_followees_d'], \
df_final_test['inter_followers'], df_final_test['inter_followees']= compute_features_stage1(df_final_test)
hdf = HDFStore('data/fea_sample/storage_sample_stage1.h5')
hdf.put('train_df',df_final_train, format='table', data_columns=True)
hdf.put('test_df',df_final_test, format='table', data_columns=True)
hdf.close()
else:
df_final_train = read_hdf('data/fea_sample/storage_sample_stage1.h5', 'train_df',mode='r')
df_final_test = read_hdf('data/fea_sample/storage_sample_stage1.h5', 'test_df',mode='r')
We will create each of these features for both train and test data points: Adamic/Adar index, follows_back, same weakly connected component, and shortest path.
if not os.path.isfile('data/fea_sample/storage_sample_stage2.h5'):
#mapping adar index on train
df_final_train['adar_index'] = df_final_train.apply(lambda row: calc_adar_in(row['source_node'],row['destination_node']),axis=1)
#mapping adar index on test
df_final_test['adar_index'] = df_final_test.apply(lambda row: calc_adar_in(row['source_node'],row['destination_node']),axis=1)
#--------------------------------------------------------------------------------------------------------
#mapping followback or not on train
df_final_train['follows_back'] = df_final_train.apply(lambda row: follows_back(row['source_node'],row['destination_node']),axis=1)
#mapping followback or not on test
df_final_test['follows_back'] = df_final_test.apply(lambda row: follows_back(row['source_node'],row['destination_node']),axis=1)
#--------------------------------------------------------------------------------------------------------
#mapping same component of wcc or not on train
df_final_train['same_comp'] = df_final_train.apply(lambda row: belongs_to_same_wcc(row['source_node'],row['destination_node']),axis=1)
#mapping same component of wcc or not on test
df_final_test['same_comp'] = df_final_test.apply(lambda row: belongs_to_same_wcc(row['source_node'],row['destination_node']),axis=1)
#--------------------------------------------------------------------------------------------------------
#mapping shortest path on train
df_final_train['shortest_path'] = df_final_train.apply(lambda row: compute_shortest_path_length(row['source_node'],row['destination_node']),axis=1)
#mapping shortest path on test
df_final_test['shortest_path'] = df_final_test.apply(lambda row: compute_shortest_path_length(row['source_node'],row['destination_node']),axis=1)
hdf = HDFStore('data/fea_sample/storage_sample_stage2.h5')
hdf.put('train_df',df_final_train, format='table', data_columns=True)
hdf.put('test_df',df_final_test, format='table', data_columns=True)
hdf.close()
else:
df_final_train = read_hdf('data/fea_sample/storage_sample_stage2.h5', 'train_df',mode='r')
df_final_test = read_hdf('data/fea_sample/storage_sample_stage2.h5', 'test_df',mode='r')
We will create each of these features for both train and test data points: weight features, PageRank, Katz centrality, and HITS scores.
In order to determine the similarity of nodes, an edge weight value was calculated between nodes. Edge weight decreases as the neighbor count goes up. Intuitively, if one million people follow a celebrity on a social network, chances are that most of them have never met each other or the celebrity. On the other hand, if a user has 30 contacts in his/her social network, the chances are higher that many of them know each other.
credit - Graph-based Features for Supervised Link Prediction
William Cukierski, Benjamin Hamner, Bo Yang
\begin{equation} W = \frac{1}{\sqrt{1+|X|}} \end{equation}
Since this is a directed graph, weight-in and weight-out are calculated separately.
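For example, a node with no followers gets weight $\frac{1}{\sqrt{1+0}} = 1$, while a celebrity with 99 followers gets $\frac{1}{\sqrt{1+99}} = 0.1$.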
#weight for source and destination of each link
Weight_in = {}
Weight_out = {}
for i in tqdm(train_graph.nodes()):
s1=set(train_graph.predecessors(i))
w_in = 1.0/(np.sqrt(1+len(s1)))
Weight_in[i]=w_in
s2=set(train_graph.successors(i))
w_out = 1.0/(np.sqrt(1+len(s2)))
Weight_out[i]=w_out
#for imputing with mean
mean_weight_in = np.mean(list(Weight_in.values()))
mean_weight_out = np.mean(list(Weight_out.values()))
if not os.path.isfile('data/fea_sample/storage_sample_stage3.h5'):
#mapping to pandas train
df_final_train['weight_in'] = df_final_train.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in))
df_final_train['weight_out'] = df_final_train.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out))
#mapping to pandas test
df_final_test['weight_in'] = df_final_test.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in))
df_final_test['weight_out'] = df_final_test.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out))
#some features engineerings on the in and out weights
df_final_train['weight_f1'] = df_final_train.weight_in + df_final_train.weight_out
df_final_train['weight_f2'] = df_final_train.weight_in * df_final_train.weight_out
df_final_train['weight_f3'] = (2*df_final_train.weight_in + 1*df_final_train.weight_out)
df_final_train['weight_f4'] = (1*df_final_train.weight_in + 2*df_final_train.weight_out)
#some features engineerings on the in and out weights
df_final_test['weight_f1'] = df_final_test.weight_in + df_final_test.weight_out
df_final_test['weight_f2'] = df_final_test.weight_in * df_final_test.weight_out
df_final_test['weight_f3'] = (2*df_final_test.weight_in + 1*df_final_test.weight_out)
df_final_test['weight_f4'] = (1*df_final_test.weight_in + 2*df_final_test.weight_out)
if not os.path.isfile('data/fea_sample/storage_sample_stage3.h5'):
#page rank for source and destination in Train and Test
#if anything not there in train graph then adding mean page rank
df_final_train['page_rank_s'] = df_final_train.source_node.apply(lambda x:pr.get(x,mean_pr))
df_final_train['page_rank_d'] = df_final_train.destination_node.apply(lambda x:pr.get(x,mean_pr))
df_final_test['page_rank_s'] = df_final_test.source_node.apply(lambda x:pr.get(x,mean_pr))
df_final_test['page_rank_d'] = df_final_test.destination_node.apply(lambda x:pr.get(x,mean_pr))
#================================================================================
#Katz centrality score for source and destination in Train and test
#if anything not there in train graph then adding mean katz score
df_final_train['katz_s'] = df_final_train.source_node.apply(lambda x: katz.get(x,mean_katz))
df_final_train['katz_d'] = df_final_train.destination_node.apply(lambda x: katz.get(x,mean_katz))
df_final_test['katz_s'] = df_final_test.source_node.apply(lambda x: katz.get(x,mean_katz))
df_final_test['katz_d'] = df_final_test.destination_node.apply(lambda x: katz.get(x,mean_katz))
#================================================================================
#Hits algorithm score for source and destination in Train and test
#if anything not there in train graph then adding 0
df_final_train['hubs_s'] = df_final_train.source_node.apply(lambda x: hits[0].get(x,0))
df_final_train['hubs_d'] = df_final_train.destination_node.apply(lambda x: hits[0].get(x,0))
df_final_test['hubs_s'] = df_final_test.source_node.apply(lambda x: hits[0].get(x,0))
df_final_test['hubs_d'] = df_final_test.destination_node.apply(lambda x: hits[0].get(x,0))
#================================================================================
#Hits algorithm score for source and destination in Train and Test
#if anything not there in train graph then adding 0
df_final_train['authorities_s'] = df_final_train.source_node.apply(lambda x: hits[1].get(x,0))
df_final_train['authorities_d'] = df_final_train.destination_node.apply(lambda x: hits[1].get(x,0))
df_final_test['authorities_s'] = df_final_test.source_node.apply(lambda x: hits[1].get(x,0))
df_final_test['authorities_d'] = df_final_test.destination_node.apply(lambda x: hits[1].get(x,0))
#================================================================================
hdf = HDFStore('data/fea_sample/storage_sample_stage3.h5')
hdf.put('train_df',df_final_train, format='table', data_columns=True)
hdf.put('test_df',df_final_test, format='table', data_columns=True)
hdf.close()
else:
df_final_train = read_hdf('data/fea_sample/storage_sample_stage3.h5', 'train_df',mode='r')
df_final_test = read_hdf('data/fea_sample/storage_sample_stage3.h5', 'test_df',mode='r')
we will create these each of these features for both train and test data points
def svd(x, S):
try:
z = sadj_dict[x]
return S[z]
except:
return [0,0,0,0,0,0]
#for svd features to get feature vector creating a dict node val and index in svd vector
sadj_col = sorted(train_graph.nodes())
sadj_dict = { val:idx for idx,val in enumerate(sadj_col)}
len(sadj_col)
sadj_dict
Adj = nx.adjacency_matrix(train_graph,nodelist=sorted(train_graph.nodes())).asfptype()
U, s, V = svds(Adj, k = 6)
print('Adjacency matrix Shape',Adj.shape)
print('U Shape',U.shape)
print('V Shape',V.shape)
print('s Shape',s.shape)
if not os.path.isfile('data/fea_sample/storage_sample_stage4.h5'):
#===================================================================================================
df_final_train[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \
df_final_train.source_node.apply(lambda x: svd(x, U)).apply(pd.Series)
df_final_train[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \
df_final_train.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series)
#===================================================================================================
df_final_train[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \
df_final_train.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)
df_final_train[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \
df_final_train.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)
#===================================================================================================
df_final_test[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \
df_final_test.source_node.apply(lambda x: svd(x, U)).apply(pd.Series)
df_final_test[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \
df_final_test.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series)
#===================================================================================================
df_final_test[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \
df_final_test.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)
df_final_test[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \
df_final_test.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)
#===================================================================================================
hdf = HDFStore('data/fea_sample/storage_sample_stage4.h5')
hdf.put('train_df',df_final_train, format='table', data_columns=True)
hdf.put('test_df',df_final_test, format='table', data_columns=True)
hdf.close()
# prepared and stored the data for the machine learning models
# please check the FB_Models.ipynb
#reading
from pandas import read_hdf
df_final_train = read_hdf('storage_sample_stage4.h5', 'train_df',mode='r')
df_final_test = read_hdf('storage_sample_stage4.h5', 'test_df',mode='r')
df_final_train.shape
df_final_test.shape
df_final_train.head(5)
df_final_train.columns
df_train = df_final_train[['source_node','destination_node']].copy()
df_test = df_final_test[['source_node','destination_node']].copy()
df_train.head(2)
df_test.head(2)
#renamed from compute_features_stage1 to avoid shadowing the earlier function with a different signature
def compute_features_dest(df_final):
    #followers and followees of the destination node only
    num_followers_d = []
    num_followees_d = []
    for i,row in df_final.iterrows():
        try:
            d1 = set(train_graph.predecessors(row['destination_node']))
            d2 = set(train_graph.successors(row['destination_node']))
        except:
            d1 = set()
            d2 = set()
        num_followers_d.append(len(d1))
        num_followees_d.append(len(d2))
    return num_followers_d, num_followees_d
df_train['num_followers_d'], df_train['num_followees_d'] = compute_features_dest(df_train)
df_test['num_followers_d'], df_test['num_followees_d'] = compute_features_dest(df_test)
df_train.head(3)
df_test.head(2)
df_train.num_followers_d.unique()
df_test.num_followers_d.unique()
d_followers_tr= df_train.num_followers_d.values
type(d_followers_tr)
df_final_train['num_followers_d']= d_followers_tr
df_final_train.head(5)
d_followers_test= df_test.num_followers_d.values
df_final_test['num_followers_d']= d_followers_test
df_final_test.head(5)
#saving the csv's
df_final_train.to_csv('df_final_train.csv')
df_final_test.to_csv('df_final_test.csv')
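Preferential attachment: nodes with many neighbours are more likely to form new links, so the score for a pair is the product of their neighbour counts, $PA(x,y) = |N(x)| \cdot |N(y)|$; below it is computed separately from follower counts and followee counts.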
# for train data
df_final_train['pa_followers']= (df_final_train.num_followers_s)*(df_final_train.num_followers_d)
df_final_train['pa_followees']= (df_final_train.num_followees_s)*(df_final_train.num_followees_d)
df_final_train.head(3)
# for test data
df_final_test['pa_followers']= (df_final_test.num_followers_s)*(df_final_test.num_followers_d)
df_final_test['pa_followees']= (df_final_test.num_followees_s)*(df_final_test.num_followees_d)
df_final_test.head(3)
df_final_train.columns
#train data: svd_dot = dot product of the 12-dim source svd vector with the 12-dim destination svd vector
s_cols = ['svd_u_s_1','svd_u_s_2','svd_u_s_3','svd_u_s_4','svd_u_s_5','svd_u_s_6',
          'svd_v_s_1','svd_v_s_2','svd_v_s_3','svd_v_s_4','svd_v_s_5','svd_v_s_6']
d_cols = ['svd_u_d_1','svd_u_d_2','svd_u_d_3','svd_u_d_4','svd_u_d_5','svd_u_d_6',
          'svd_v_d_1','svd_v_d_2','svd_v_d_3','svd_v_d_4','svd_v_d_5','svd_v_d_6']
# row-wise dot product ##https://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html
df_final_train['svd_dot'] = np.einsum('ij,ij->i',
                                      df_final_train[s_cols].values,
                                      df_final_train[d_cols].values)
df_final_train.head(3)
#test data: same row-wise dot product using the column lists defined above
df_final_test['svd_dot'] = np.einsum('ij,ij->i',
                                     df_final_test[s_cols].values,
                                     df_final_test[d_cols].values)
df_final_test.head(3)
hdf = HDFStore('storage_sample_stage5.h5')
hdf.put('train_df',df_final_train, format='table', data_columns=True)
hdf.put('test_df',df_final_test, format='table', data_columns=True)
hdf.close()
#saving the csv's
df_final_train.to_csv('df_final_train.csv')
df_final_test.to_csv('df_final_test.csv')
#Importing Libraries
# please do go through this python notebook:
import warnings
warnings.filterwarnings("ignore")
import csv
import pandas as pd # pandas to create small dataframes
import datetime # convert to unix time
import time # convert to unix time
# if numpy is not installed already: pip3 install numpy
import numpy as np # arithmetic operations on arrays
# matplotlib: used to plot graphs
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns # plots
from matplotlib import rcParams # size of plots
from sklearn.cluster import MiniBatchKMeans, KMeans # clustering
import math
import pickle
import os
# to install xgboost: pip3 install xgboost
#import xgboost as xgb
import networkx as nx
import pdb
from pandas import HDFStore, DataFrame
from pandas import read_hdf
from scipy.sparse.linalg import svds, eigs
import gc
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
#reading
from pandas import read_hdf
df_final_train = read_hdf('storage_sample_stage5.h5', 'train_df',mode='r')
df_final_test = read_hdf('storage_sample_stage5.h5', 'test_df',mode='r')
df_final_train.columns
df_final_train.head(2)
y_train = df_final_train.indicator_link
y_test = df_final_test.indicator_link
df_final_train.drop(['source_node', 'destination_node','indicator_link'],axis=1,inplace=True)
df_final_test.drop(['source_node', 'destination_node','indicator_link'],axis=1,inplace=True)
estimators = [10,50,100,250,450]
train_scores = []
test_scores = []
for i in estimators:
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=5, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=52, min_samples_split=120,
min_weight_fraction_leaf=0.0, n_estimators=i, n_jobs=-1,random_state=25,verbose=0,warm_start=False)
clf.fit(df_final_train,y_train)
train_sc = f1_score(y_train,clf.predict(df_final_train))
test_sc = f1_score(y_test,clf.predict(df_final_test))
test_scores.append(test_sc)
train_scores.append(train_sc)
print('Estimators = ',i,'Train Score',train_sc,'test Score',test_sc)
plt.plot(estimators,train_scores,label='Train Score')
plt.plot(estimators,test_scores,label='Test Score')
plt.xlabel('Estimators')
plt.ylabel('Score')
plt.title('Estimators vs score at depth of 5')
plt.legend()
plt.show()
depths = [3,9,11,15,20,35,50,70,130]
train_scores = []
test_scores = []
for i in depths:
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=i, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=52, min_samples_split=120,
min_weight_fraction_leaf=0.0, n_estimators=115, n_jobs=-1,random_state=25,verbose=0,warm_start=False)
clf.fit(df_final_train,y_train)
train_sc = f1_score(y_train,clf.predict(df_final_train))
test_sc = f1_score(y_test,clf.predict(df_final_test))
test_scores.append(test_sc)
train_scores.append(train_sc)
print('depth = ',i,'Train Score',train_sc,'test Score',test_sc)
plt.plot(depths,train_scores,label='Train Score')
plt.plot(depths,test_scores,label='Test Score')
plt.xlabel('Depth')
plt.ylabel('Score')
plt.title('Depth vs score at n_estimators = 115')
plt.legend()
plt.show()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
param_dist = {"n_estimators":sp_randint(105,125),
"max_depth": sp_randint(10,15),
"min_samples_split": sp_randint(110,190),
"min_samples_leaf": sp_randint(25,65)}
clf = RandomForestClassifier(random_state=25,n_jobs=-1)
rf_random = RandomizedSearchCV(clf, param_distributions=param_dist,
                               n_iter=5, cv=10, scoring='f1', random_state=25,
                               return_train_score=True) # needed for mean_train_score below
rf_random.fit(df_final_train,y_train)
print('mean test scores',rf_random.cv_results_['mean_test_score'])
print('mean train scores',rf_random.cv_results_['mean_train_score'])
print(rf_random.best_estimator_)
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=14, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=28, min_samples_split=111,
min_weight_fraction_leaf=0.0, n_estimators=121, n_jobs=-1,
oob_score=False, random_state=25, verbose=0, warm_start=False)
clf.fit(df_final_train,y_train)
y_train_pred = clf.predict(df_final_train)
y_test_pred = clf.predict(df_final_test)
from sklearn.metrics import f1_score
print('Train f1 score',f1_score(y_train,y_train_pred))
print('Test f1 score',f1_score(y_test,y_test_pred))
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(test_y, predict_y):
    C = confusion_matrix(test_y, predict_y)
    A = (((C.T)/(C.sum(axis=1))).T) # row-normalized: recall matrix
    B = (C/C.sum(axis=0))           # column-normalized: precision matrix
    plt.figure(figsize=(20,4))
    labels = [0,1]
    # representing the raw counts C in heatmap format
    cmap = sns.light_palette("blue")
    plt.subplot(1, 3, 1)
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Confusion matrix")
    # representing B in heatmap format
    plt.subplot(1, 3, 2)
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Precision matrix")
    # representing A in heatmap format
    plt.subplot(1, 3, 3)
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Recall matrix")
    plt.show()
print('Train confusion_matrix')
plot_confusion_matrix(y_train,y_train_pred)
print('Test confusion_matrix')
plot_confusion_matrix(y_test,y_test_pred)
from sklearn.metrics import roc_curve, auc
# note: hard class predictions give only one operating point on the curve;
# clf.predict_proba would yield a smoother ROC
fpr, tpr, ths = roc_curve(y_test, y_test_pred)
auc_sc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='navy',label='ROC curve (area = %0.2f)' % auc_sc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic with test data')
plt.legend()
plt.show()
features = df_final_train.columns
importances = clf.feature_importances_
indices = (np.argsort(importances))[-25:]
plt.figure(figsize=(10,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='r', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
#https://dask-ml.readthedocs.io/en/stable/modules/generated/dask_ml.xgboost.XGBClassifier.html
#https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
xgb = XGBClassifier()
parameters = {'n_estimators': [10,50,100,250,350,450],'max_depth': [4,10,12,15,20,35]}
clf1 = RandomizedSearchCV(xgb, parameters, cv=10, scoring='f1',return_train_score=True,n_jobs=-1)
rs1 = clf1.fit(df_final_train, y_train)
df = pd.DataFrame(clf1.cv_results_)
df.to_csv('hyp.csv', index=False) # saved so the plotting cell below can read it
df.head(2)
df.columns
import pandas as pd
df= pd.read_csv('hyp.csv')
%matplotlib inline
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
import numpy as np
def enable_plotly_in_cell():
import IPython
from plotly.offline import init_notebook_mode
display(IPython.core.display.HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
init_notebook_mode(connected=False)
# https://plot.ly/python/3d-axes/
trace1 = go.Scatter3d(x=df['param_n_estimators'],y=df['param_max_depth'],z=df['mean_train_score'], name = 'train')
trace2 = go.Scatter3d(x=df['param_n_estimators'],y=df['param_max_depth'],z=df['mean_test_score'], name = 'Cross validation')
data = [trace1, trace2]
enable_plotly_in_cell()
layout = go.Layout(scene = dict(
xaxis = dict(title='Estimators'),
yaxis = dict(title='Max_depth'),
zaxis = dict(title='F1 score'),))
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')
print(clf1.best_estimator_)
xgb= XGBClassifier(n_estimators= 250 , max_depth= 10)
xgb.fit(df_final_train, y_train)
y_train_pred = xgb.predict(df_final_train)
y_test_pred = xgb.predict(df_final_test)
from sklearn.metrics import f1_score
print('Train f1 score',f1_score(y_train,y_train_pred))
print('Test f1 score',f1_score(y_test,y_test_pred))
print('Train confusion_matrix')
plot_confusion_matrix(y_train,y_train_pred)
print('Test confusion_matrix')
plot_confusion_matrix(y_test,y_test_pred)
features = df_final_train.columns
importances = xgb.feature_importances_
indices = (np.argsort(importances))[-25:]
plt.figure(figsize=(10,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='r', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
#Ref: http://zetcode.com/python/prettytable/
from prettytable import PrettyTable
x = PrettyTable()
print('Summary')
x.field_names = ["Model","max_depth","n_estimators" ,"Test F1 score"]
x.add_row(["Random Forest", 14, 121, 0.92])
x.add_row(["XGBoost", 10, 250, 0.93])
print(x)
Conclusion: both models perform well on the engineered graph features; XGBoost (max_depth=10, n_estimators=250) achieves the best test F1 score of 0.93, slightly ahead of the Random Forest (max_depth=14, n_estimators=121) at 0.92.